Food delivery time prediction

Features:


1 ID
2 Delivery_person_ID
3 Delivery_person_Age
4 Delivery_person_Ratings
5 Restaurant_latitude
6 Restaurant_longitude
7 Delivery_location_latitude
8 Delivery_location_longitude
9 Type_of_order
10 Type_of_vehicle
11 Time_taken(min)
In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
In [2]:
df=pd.read_csv('delivery.csv')
In [3]:
print(df.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45593 entries, 0 to 45592
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   ID                           45593 non-null  object 
 1   Delivery_person_ID           45593 non-null  object 
 2   Delivery_person_Age          45593 non-null  int64  
 3   Delivery_person_Ratings      45593 non-null  float64
 4   Restaurant_latitude          45593 non-null  float64
 5   Restaurant_longitude         45593 non-null  float64
 6   Delivery_location_latitude   45593 non-null  float64
 7   Delivery_location_longitude  45593 non-null  float64
 8   Type_of_order                45593 non-null  object 
 9   Type_of_vehicle              45593 non-null  object 
 10  Time_taken(min)              45593 non-null  int64  
dtypes: float64(5), int64(2), object(4)
memory usage: 3.8+ MB
None
In [4]:
df.shape
Out[4]:
(45593, 11)
In [5]:
df.dtypes
Out[5]:
ID                              object
Delivery_person_ID              object
Delivery_person_Age              int64
Delivery_person_Ratings        float64
Restaurant_latitude            float64
Restaurant_longitude           float64
Delivery_location_latitude     float64
Delivery_location_longitude    float64
Type_of_order                   object
Type_of_vehicle                 object
Time_taken(min)                  int64
dtype: object
In [6]:
df.isnull().sum()
Out[6]:
ID                             0
Delivery_person_ID             0
Delivery_person_Age            0
Delivery_person_Ratings        0
Restaurant_latitude            0
Restaurant_longitude           0
Delivery_location_latitude     0
Delivery_location_longitude    0
Type_of_order                  0
Type_of_vehicle                0
Time_taken(min)                0
dtype: int64

Dataset doesnot contain duplicate or null values

In [7]:
df.sample(frac=1)
Out[7]:
ID Delivery_person_ID Delivery_person_Age Delivery_person_Ratings Restaurant_latitude Restaurant_longitude Delivery_location_latitude Delivery_location_longitude Type_of_order Type_of_vehicle Time_taken(min)
27098 B815 COIMBRES14DEL01 21 4.9 11.003681 76.975525 11.113681 77.085525 Snack scooter 27
18223 B248 MYSRES09DEL02 31 4.9 12.323194 76.630583 12.343194 76.650583 Snack motorcycle 32
21065 B4EC MYSRES08DEL03 38 5.0 12.297954 76.665169 12.387954 76.755169 Meal electric_scooter 43
36111 5A80 JAPRES16DEL03 39 4.3 26.849596 75.800512 26.939596 75.890512 Buffet scooter 34
17974 C76A LUDHRES15DEL01 37 4.4 30.899584 75.809346 30.909584 75.819346 Meal motorcycle 39
... ... ... ... ... ... ... ... ... ... ... ...
23182 B48B MUMRES03DEL02 26 4.7 19.223840 72.841347 19.303840 72.921347 Drinks motorcycle 31
29111 1EB6 HYDRES15DEL02 29 4.6 17.459710 78.368855 17.479710 78.388855 Buffet motorcycle 19
10137 8911 CHENRES03DEL01 38 4.4 13.091809 80.219104 13.161809 80.289104 Drinks motorcycle 33
36862 D5D1 AURGRES06DEL02 26 4.6 19.874449 75.360232 19.954449 75.440232 Snack motorcycle 20
2331 360F RANCHIRES02DEL02 24 5.0 0.000000 0.000000 0.020000 0.020000 Buffet motorcycle 17

45593 rows × 11 columns

Calculation of distance between the restaurant and delivery location using using the haversine formula

In [8]:
# Set the earth's radius (in kilometers)
R = 6371
In [9]:
# Convert degrees to radian
def deg_to_rad(degrees):
    return degrees * (np.pi/180)
In [10]:
# Function to calculate the distance between two points using the haversine formula
def distcalculate(lat1, lon1, lat2, lon2):
    d_lat = deg_to_rad(lat2-lat1)
    d_lon = deg_to_rad(lon2-lon1)
    a = np.sin(d_lat/2)**2 + np.cos(deg_to_rad(lat1)) * np.cos(deg_to_rad(lat2)) * np.sin(d_lon/2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    return R * c
In [11]:
# Calculate the distance between each pair of points
df['distance'] = np.nan

for i in range(len(df)):
    df.loc[i, 'distance'] = distcalculate(df.loc[i, 'Restaurant_latitude'], 
                                        df.loc[i, 'Restaurant_longitude'], 
                                        df.loc[i, 'Delivery_location_latitude'], 
                                        df.loc[i, 'Delivery_location_longitude'])
In [12]:
df.sample(frac=1)
Out[12]:
ID Delivery_person_ID Delivery_person_Age Delivery_person_Ratings Restaurant_latitude Restaurant_longitude Delivery_location_latitude Delivery_location_longitude Type_of_order Type_of_vehicle Time_taken(min) distance
20015 82CE COIMBRES010DEL02 39 4.1 11.003008 76.975440 11.133009 77.105440 Meal scooter 47 20.253789
38820 3FB1 VADRES11DEL01 27 4.6 22.308096 73.167753 22.348096 73.207753 Drinks scooter 25 6.058919
20575 2.5E+05 SURRES06DEL01 30 3.8 21.185047 72.808590 21.295047 72.918590 Snack motorcycle 40 16.720679
35700 97A7 PUNERES09DEL02 21 4.6 18.536562 73.896485 18.556562 73.916485 Meal scooter 27 3.064487
8685 4272 SURRES03DEL02 38 4.5 21.186884 72.793616 21.236884 72.843616 Drinks scooter 25 7.600984
... ... ... ... ... ... ... ... ... ... ... ... ...
31017 9067 INDORES02DEL02 22 5.0 22.651847 75.881991 22.671847 75.901991 Drinks scooter 18 3.026096
34025 7806 JAPRES18DEL01 31 4.3 26.913987 75.752891 26.953987 75.792891 Buffet motorcycle 39 5.958760
25563 25B8 COIMBRES12DEL03 30 4.8 11.000762 76.981876 11.060762 77.041876 Meal scooter 28 9.348449
21031 74FA HYDRES19DEL01 30 4.5 17.458998 78.500366 17.568998 78.610366 Buffet electric_scooter 31 16.901691
11322 8F57 INDORES07DEL02 23 4.6 22.722634 75.886959 22.742634 75.906959 Buffet scooter 34 3.025377

45593 rows × 12 columns

In [13]:
figure = px.scatter(data_frame = df, 
                    x="distance",
                    y="Time_taken(min)", 
                    size="Time_taken(min)", 
                    trendline="ols", 
                    title = "Relationship Between Distance and Time Taken")
figure.show()

There is a consistent relationship between the time taken and the distance travelled to deliver the food. It means that most delivery partners deliver food within 25-30 minutes, regardless of distance.

In [13]:
figure = px.scatter(data_frame = df, 
                    x="Delivery_person_Age",
                    y="Time_taken(min)", 
                    size="Time_taken(min)", 
                    color = "distance",
                    trendline="ols", 
                    title = "Relationship Between Time Taken and Age")
figure.show()

There is a linear relationship between the time taken to deliver the food and the age of the delivery partner. It means young delivery partners take less time to deliver the food compared to the elder partners.

In [14]:
figure = px.scatter(data_frame = df,x="Delivery_person_Ratings",y="Time_taken(min)",size="Time_taken(min)",color = "distance",
                    trendline="ols", 
                    title = "Relationship Between Time Taken and Ratings")
figure.show()

There is an inverse linear relationship between the time taken to deliver the food and the ratings of the delivery partner. It means delivery partners with higher ratings take less time to deliver the food compared to partners with low ratings

In [15]:
fig = px.box(df,x="Type_of_vehicle",y="Time_taken(min)", color="Type_of_order")
fig.show()

there is not much difference between the time taken by delivery partners depending on the vehicle they are driving and the type of food they are delivering.

The features that contribute most to the food delivery time based on our analysis are:
-age of the delivery partner
-ratings of the delivery partner
-distance between the restaurant and the delivery location

In [16]:
#splitting data
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from math import sqrt
In [17]:
x = df[["Delivery_person_Age","Delivery_person_Ratings","distance"]]
y = df["Time_taken(min)"]
x_train, x_test, y_train, y_test = train_test_split(x, y,test_size=0.2,random_state=42)
In [18]:
print(x.shape)
print(y.shape)
(45593, 3)
(45593,)
In [19]:
x_train.head(5)
Out[19]:
Delivery_person_Age Delivery_person_Ratings distance
29044 37 4.8 1.514805
41736 28 4.2 4.663310
17874 24 4.6 10.586541
42093 34 4.0 6.058874
22952 28 4.8 17.297866
In [20]:
y_train.head(5)
Out[20]:
29044    28
41736    16
17874    28
42093    32
22952    26
Name: Time_taken(min), dtype: int64

Random Forest Regressor

In [21]:
from sklearn.ensemble import RandomForestRegressor
In [22]:
rfr=RandomForestRegressor()
In [23]:
rfr.fit(x_train,y_train)
Out[23]:
RandomForestRegressor()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestRegressor()
In [24]:
y_pred=rfr.predict(x_test)
In [25]:
print("Mean Absolute Error =",mean_absolute_error(y_pred,y_test))
print("Mean Squarred Error =",mean_squared_error(y_pred,y_test))
print("Root Mean Square Error =",sqrt(mean_squared_error(y_pred,y_test)))
Mean Absolute Error = 6.338063968296876
Mean Squarred Error = 66.69381612232372
Root Mean Square Error = 8.166628197874795

XG Boost Regressor

In [26]:
from xgboost import XGBRegressor
In [27]:
xgbr=XGBRegressor()
In [28]:
xgbr.fit(x_train,y_train)
Out[28]:
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=None, ...)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=None, ...)
In [29]:
y_pred=xgbr.predict(x_test)
In [30]:
print("Mean Absolute Error =",mean_absolute_error(y_pred,y_test))
print("Mean Squarred Error =",mean_squared_error(y_pred,y_test))
print("Root Mean Square Error =",sqrt(mean_squared_error(y_pred,y_test)))
Mean Absolute Error = 5.825209372163093
Mean Squarred Error = 55.21338319296294
Root Mean Square Error = 7.430570852428697
In [31]:
print("Food Delivery Time Prediction")
a = int(input("Age of Delivery Partner: "))
b = float(input("Ratings of Previous Deliveries: "))
c = int(input("Total Distance: "))
features = np.array([[a, b, c]])
print("Predicted Delivery Time in Minutes(RFR) = ", rfr.predict(features))
print("Predicted Delivery Time in Minutes(XGBR) = ", xgbr.predict(features))
Food Delivery Time Prediction
Age of Delivery Partner: 24
Ratings of Previous Deliveries: 3
Total Distance: 30
Predicted Delivery Time in Minutes(RFR) =  [35.7]
Predicted Delivery Time in Minutes(XGBR) =  [35.189297]
C:\Users\rajes\AppData\Roaming\Python\Python311\site-packages\sklearn\base.py:439: UserWarning:

X does not have valid feature names, but RandomForestRegressor was fitted with feature names

In [32]:
import pickle
In [33]:
with open('rfr_model.pkl', 'wb') as f:
    pickle.dump(rfr, f)
In [34]:
with open('xgbr_model.pkl', 'wb') as f:
    pickle.dump(xgbr, f)
In [35]:
 
In [ ]: